This code covers chapter 2 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar.
This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.
Load the iris data set (set of records as a data.frame)
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Inspect data (plot for data.frames actually uses pairs plot). Possibly you can see noise and ouliers.
plot(iris, col=iris$Species)
Get summary statistics for each column (outliers, missing values)
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Are there duplicate entries?
i <- duplicated(iris)
i
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Which object is a duplicate?
which(i)
## [1] 143
iris[i,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 143 5.8 2.7 5.1 1.9 virginica
See also ? unique and ? complete.cases: Often you will do something like:
clean.data <- unique(iris[complete.cases(iris),])
summary(clean.data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.00 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.80 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.00 Median :4.300 Median :1.300
## Mean :5.844 Mean :3.06 Mean :3.749 Mean :1.195
## 3rd Qu.:6.400 3rd Qu.:3.30 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.40 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :49
##
##
##
Note that one case (non-unique) is gone. All cases with missing values will also have been removed.
Aggregate by species (using mean or median)
aggregate(. ~ Species, data = iris, FUN = mean)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
aggregate(. ~ Species, data = iris, FUN = median)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.0 3.4 1.50 0.2
## 2 versicolor 5.9 2.8 4.35 1.3
## 3 virginica 6.5 3.0 5.55 2.0
Uses the formula interface . ~ Species means all (.) depending on feature Species.
id <- sample(1:nrow(iris), 20)
id
## [1] 51 109 57 10 145 100 58 112 147 16 63 47 114 45 39 117 129
## [18] 120 14 140
s <- iris[id,]
plot(s, col=s$Species)
You need to install the package sampling with: install.packages(“sampling”)
library(sampling)
id2 <- strata(iris, stratanames="Species", size=c(5,5,5), method="srswor")
id2
## Species ID_unit Prob Stratum
## 1 setosa 1 0.1 1
## 3 setosa 3 0.1 1
## 21 setosa 21 0.1 1
## 25 setosa 25 0.1 1
## 42 setosa 42 0.1 1
## 57 versicolor 57 0.1 2
## 65 versicolor 65 0.1 2
## 66 versicolor 66 0.1 2
## 81 versicolor 81 0.1 2
## 85 versicolor 85 0.1 2
## 112 virginica 112 0.1 3
## 117 virginica 117 0.1 3
## 129 virginica 129 0.1 3
## 135 virginica 135 0.1 3
## 143 virginica 143 0.1 3
s2 <- iris[id2$ID_unit,]
plot(s2, col=s2$Species)
Look at data first
library(scatterplot3d)
scatterplot3d(iris[,1:3], color=as.integer(iris$Species))
Intereactive 3d plots (needs package rgl)
#library(rgl)
#plot3d(as.matrix(iris[,1:3]), col=as.integer(iris$Species), size=5)
Intereactive 3d plots (needs package plotly)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(iris, x = Sepal.Length, y= Petal.Length, z = Sepal.Width,
size = Petal.Width, color = Species, type="scatter3d", mode="markers")
Calculate the principal components
pc <- prcomp(as.matrix(iris[,1:4]))
How important is each principal component?
plot(pc)
Inspect the raw object (display structure)
str(pc)
## List of 5
## $ sdev : num [1:4] 2.056 0.493 0.28 0.154
## $ rotation: num [1:4, 1:4] 0.3614 -0.0845 0.8567 0.3583 -0.6566 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 5.84 3.06 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ scale : logi FALSE
## $ x : num [1:150, 1:4] -2.68 -2.71 -2.89 -2.75 -2.73 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
plot(pc$x, col=iris$Species) # plot the first 2 principal components
Plot the projected data and add the original dimensions as arrows
biplot(pc, col = c("grey", "red"))
We will talk about feature selection when we discuss classification models.
plot(iris$Petal.Width, 1:150, ylab="index")
A histogram is a better visualization for the distribution of a single variable.
hist(iris$Petal.Width)
Equal interval width
cut(iris$Sepal.Width, breaks=3)
## [1] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6]
## [8] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [15] (3.6,4.4] (3.6,4.4] (3.6,4.4] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
## [22] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [29] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
## [36] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]
## [43] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4]
## [50] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8]
## [57] (2.8,3.6] (2,2.8] (2.8,3.6] (2,2.8] (2,2.8] (2.8,3.6] (2,2.8]
## [64] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8]
## [71] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8] (2.8,3.6] (2.8,3.6] (2,2.8]
## [78] (2.8,3.6] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8] (2,2.8] (2,2.8]
## [85] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8] (2.8,3.6] (2,2.8] (2,2.8]
## [92] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [99] (2,2.8] (2,2.8] (2.8,3.6] (2,2.8] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [106] (2.8,3.6] (2,2.8] (2.8,3.6] (2,2.8] (2.8,3.6] (2.8,3.6] (2,2.8]
## [113] (2.8,3.6] (2,2.8] (2,2.8] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2,2.8]
## [120] (2,2.8] (2.8,3.6] (2,2.8] (2,2.8] (2,2.8] (2.8,3.6] (2.8,3.6]
## [127] (2,2.8] (2.8,3.6] (2,2.8] (2.8,3.6] (2,2.8] (3.6,4.4] (2,2.8]
## [134] (2,2.8] (2,2.8] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [141] (2.8,3.6] (2.8,3.6] (2,2.8] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]
## [148] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## Levels: (2,2.8] (2.8,3.6] (3.6,4.4]
Other methods (equal frequency, k-means clustering, etc.)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
discretize(iris$Petal.Width, method="interval", categories=3)
## [1] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [8] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [15] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [22] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [29] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [36] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [43] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [50] [0.1,0.9) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [57] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [64] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [71] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [78] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [85] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [92] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [99] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## Levels: [0.1,0.9) [0.9,1.7) [1.7,2.5]
discretize(iris$Petal.Width, method="frequency", categories=3)
## [1] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [8] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [15] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [22] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [29] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [36] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [43] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
## [50] [0.1,1.0) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [57] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [64] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [71] [1.7,2.5] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [78] [1.7,2.5] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [85] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [92] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
## [99] [1.0,1.7) [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [1.0,1.7) [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## Levels: [0.1,1.0) [1.0,1.7) [1.7,2.5]
discretize(iris$Petal.Width, method="cluster", categories=3)
## [1] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [6] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [11] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [16] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [21] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [26] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [31] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [36] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [41] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [46] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
## [51] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [56] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [61] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [66] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [71] [1.691,2.500] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [76] [0.785,1.691) [0.785,1.691) [1.691,2.500] [0.785,1.691) [0.785,1.691)
## [81] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [86] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [91] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [96] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [101] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [106] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [111] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [116] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691)
## [121] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [126] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691)
## [131] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691) [0.785,1.691)
## [136] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [141] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [146] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## Levels: [0.100,0.785) [0.785,1.691) [1.691,2.500]
hist(iris$Petal.Width,
main = "Discretization: interval", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="interval",
categories=3,onlycuts=TRUE), col="blue")
hist(iris$Petal.Width,
main = "Discretization: frequency", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="frequency",
categories=3,onlycuts=TRUE), col="blue")
hist(iris$Petal.Width,
main = "Discretization: cluster", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="cluster",
categories=3,onlycuts=TRUE), col="blue")
Standardize the scale of features to make them comparable. For each column the mean is subtracted (centering) and it is divided by the standard deviation (scaling). Now most values should be in [-3,3].
iris.scaled <- scale(iris[1:4])
head(iris.scaled)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
summary(iris.scaled)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
Note: R actually only uses dissimilarities/distances.
iris.scaled[1:5,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
Calculate distances matrices between the first 5 flowers (use only the 4 numeric columns).
dist(iris.scaled[1:5,], method="euclidean")
## 1 2 3 4
## 2 1.1722914
## 3 0.8427840 0.5216255
## 4 1.0999999 0.4325508 0.2829432
## 5 0.2592702 1.3818560 0.9882608 1.2459861
dist(iris.scaled[1:5,], method="manhattan")
## 1 2 3 4
## 2 1.3886674
## 3 1.2279853 0.7570306
## 4 1.5781768 0.6483657 0.4634868
## 5 0.3501915 1.4973323 1.3366502 1.6868417
dist(iris.scaled[1:5,], method="maximum")
## 1 2 3 4
## 2 1.1471408
## 3 0.6882845 0.4588563
## 4 0.9177126 0.3622899 0.2294282
## 5 0.2294282 1.3765690 0.9177126 1.1471408
Note: Don’t forget to scale the data if the ranges are very different!
b <- rbind(
c(0,0,0,1,1,1,1,0,0,1),
c(0,0,1,1,1,0,0,1,0,0)
)
b
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0 0 0 1 1 1 1 0 0 1
## [2,] 0 0 1 1 1 0 0 1 0 0
Jaccard index
Jaccard index is a similarity measure so R reports 1-Jaccard
dist(b, method="binary")
## 1
## 2 0.7142857
Hamming distance
Hamming distance is the number of mis-matches
dist(b, method="manhattan")
## 1
## 2 5
Works with mixed data
data <- data.frame(
height= c( 160, 185, 170),
weight= c( 52, 90, 75),
sex= c( "female", "male", "male")
)
data
## height weight sex
## 1 160 52 female
## 2 185 90 male
## 3 170 75 male
library(proxy)
##
## Attaching package: 'proxy'
##
## The following object is masked from 'package:Matrix':
##
## as.matrix
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
dist(data, method="Gower")
## 1 2
## 2 1.0000000
## 3 0.6684211 0.3315789
library(proxy)
names(pr_DB$get_entries())
## [1] "Jaccard" "Kulczynski1" "Kulczynski2"
## [4] "Mountford" "Fager" "Russel"
## [7] "simple matching" "Hamman" "Faith"
## [10] "Tanimoto" "Dice" "Phi"
## [13] "Stiles" "Michael" "Mozley"
## [16] "Yule" "Yule2" "Ochiai"
## [19] "Simpson" "Braun-Blanquet" "cosine"
## [22] "eJaccard" "correlation" "Chi-squared"
## [25] "Phi-squared" "Tschuprow" "Cramer"
## [28] "Pearson" "Gower" "Euclidean"
## [31] "Mahalanobis" "Bhjattacharyya" "Manhattan"
## [34] "supremum" "Minkowski" "Canberra"
## [37] "Wave" "divergence" "Kullback"
## [40] "Bray" "Soergel" "Levenshtein"
## [43] "Podani" "Chord" "Geodesic"
## [46] "Whittaker" "Hellinger" "fJaccard"
Pearson correlation between features (columns)
cor(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
plot(iris$Petal.Length, iris$Petal.Width)
cor(iris$Petal.Length, iris$Petal.Width)
## [1] 0.9628654
cor.test(iris$Petal.Length, iris$Petal.Width)
##
## Pearson's product-moment correlation
##
## data: iris$Petal.Length and iris$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9490525 0.9729853
## sample estimates:
## cor
## 0.9628654
plot(iris$Sepal.Length, iris$Sepal.Width)
cor(iris$Sepal.Length, iris$Sepal.Width)
## [1] -0.1175698
cor.test(iris$Sepal.Length, iris$Sepal.Width)
##
## Pearson's product-moment correlation
##
## data: iris$Sepal.Length and iris$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.27269325 0.04351158
## sample estimates:
## cor
## -0.1175698
Correlation between objects (transpose matrix first)
cc <- cor(t(iris[,1:4]))
dim(cc)
## [1] 150 150
cc[1:10,1:10]
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000000 0.9959987 0.9999739 0.9981685 0.9993473 0.9995861
## [2,] 0.9959987 1.0000000 0.9966071 0.9973966 0.9922327 0.9935919
## [3,] 0.9999739 0.9966071 1.0000000 0.9983335 0.9990611 0.9993773
## [4,] 0.9981685 0.9973966 0.9983335 1.0000000 0.9967188 0.9978326
## [5,] 0.9993473 0.9922327 0.9990611 0.9967188 1.0000000 0.9998833
## [6,] 0.9995861 0.9935919 0.9993773 0.9978326 0.9998833 1.0000000
## [7,] 0.9988112 0.9907206 0.9984377 0.9961394 0.9999140 0.9997226
## [8,] 0.9995381 0.9971181 0.9996045 0.9995456 0.9985032 0.9991788
## [9,] 0.9980766 0.9985463 0.9983561 0.9998333 0.9960309 0.9972157
## [10,] 0.9965520 0.9990329 0.9969856 0.9993068 0.9937612 0.9952606
## [,7] [,8] [,9] [,10]
## [1,] 0.9988112 0.9995381 0.9980766 0.9965520
## [2,] 0.9907206 0.9971181 0.9985463 0.9990329
## [3,] 0.9984377 0.9996045 0.9983561 0.9969856
## [4,] 0.9961394 0.9995456 0.9998333 0.9993068
## [5,] 0.9999140 0.9985032 0.9960309 0.9937612
## [6,] 0.9997226 0.9991788 0.9972157 0.9952606
## [7,] 1.0000000 0.9979521 0.9952140 0.9927272
## [8,] 0.9979521 1.0000000 0.9994062 0.9983737
## [9,] 0.9952140 0.9994062 1.0000000 0.9997398
## [10,] 0.9927272 0.9983737 0.9997398 1.0000000
library("seriation") # for pimage
pimage(cc, main = "Correlation between objects")
Convert correlations into a dissimilarities
d <- as.dist(1-abs(cc))
pimage(d, main = "Dissimilaries between objects")
convert to ordinal variables with cut (see ? cut) into ordered factors with three levels
iris_ord <- data.frame(
cut(iris[,1], 3, labels=c("short", "medium", "long"), ordered=T),
cut(iris[,2], 3, labels=c("short", "medium", "long"), ordered=T),
cut(iris[,3], 3, labels=c("short", "medium", "long"), ordered=T),
cut(iris[,4], 3, labels=c("short", "medium", "long"), ordered=T),
iris[,5])
colnames(iris_ord) <- colnames(iris)
summary(iris_ord)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## short :59 short :47 short :50 short :50 setosa :50
## medium:71 medium:88 medium:54 medium:54 versicolor:50
## long :20 long :15 long :46 long :46 virginica :50
head(iris_ord$Sepal.Length)
## [1] short short short short short short
## Levels: short < medium < long
Kendall’s tau rank correlation coefficient
cor(sapply(iris_ord[,1:4], xtfrm), method="kendall")
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1437985 0.7418595 0.7295139
## Sepal.Width -0.1437985 1.0000000 -0.3298796 -0.3154474
## Petal.Length 0.7418595 -0.3298796 1.0000000 0.9198290
## Petal.Width 0.7295139 -0.3154474 0.9198290 1.0000000
Spearman’s rho
cor(sapply(iris_ord[,1:4], xtfrm), method="spearman")
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1569659 0.7937613 0.7843406
## Sepal.Width -0.1569659 1.0000000 -0.3662775 -0.3517262
## Petal.Length 0.7937613 -0.3662775 1.0000000 0.9399038
## Petal.Width 0.7843406 -0.3517262 0.9399038 1.0000000
Note: unfortunately we have to transform the ordered factors into numbers representing the order with xtfrm first.
Compare to the Pearson correlation on the original data
cor(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
Is sepal length and species related? Use cross tabulation
tbl <- table(Sepal.Length=iris_ord$Sepal.Length, iris_ord$Species)
tbl
##
## Sepal.Length setosa versicolor virginica
## short 47 11 1
## medium 3 36 32
## long 0 3 17
Test of Independence: Pearson’s chi-squared test is performed with the null hypothesis that the joint distribution of the cell counts in a 2-dimensional contingency table is the product of the row and column marginals. (h0 is independence)
chisq.test(tbl)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 111.63, df = 4, p-value < 2.2e-16
Using xtabs instead
x <- xtabs(~Sepal.Length + Species, data=iris_ord)
x
## Species
## Sepal.Length setosa versicolor virginica
## short 47 11 1
## medium 3 36 32
## long 0 3 17
summary(x)
## Call: xtabs(formula = ~Sepal.Length + Species, data = iris_ord)
## Number of cases in table: 150
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 111.63, df = 4, p-value = 3.262e-23
Groupwise averages
aggregate(Sepal.Length ~ Species, data=iris, FUN = mean)
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
aggregate(Sepal.Width ~ Species, data=iris, FUN = mean)
## Species Sepal.Width
## 1 setosa 3.428
## 2 versicolor 2.770
## 3 virginica 2.974
Just plotting the data is not very helpful
plot(iris$Petal.Length, jitter(rep(1, nrow(iris))), ylab ="", yaxt = "n")
Histograms work better
hist(iris$Petal.Length)
rug(iris$Petal.Length)
We can also add a kernel density estimate KDE (red line)
hist(iris$Petal.Length, freq=FALSE)
rug(iris$Petal.Length)
lines(density(iris$Petal.Length), col="red", lwd = 2)
Plot 2d kernel density estimate
library(MASS)
dens <- kde2d(iris$Sepal.Length, iris$Sepal.Width, n=100)
image(dens,
xlab="Sepal.Length", ylab="Sepal.Width")
contour(dens, add=TRUE)
points(jitter(iris$Sepal.Length, 2), jitter(iris$Sepal.Width, 2),
cex=.7, pch="+")
Use a 3d plot instead
persp(dens, xlab="Sepal.Length", ylab="Sepal.Width", zlab="density",
shade=.5)
# Use plotly
#library(plotly)
#plot_ly(dens, z = z, type = "surface")